package nl.us2.cloudpelican.stormprocessor;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.tuple.Fields;
import com.google.gson.JsonObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import storm.kafka.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.TimeZone;
/**
 * Entry point of the CloudPelican Storm processor.
 * Created by robin on 07/06/15.
 *
 * <p>Parses {@code -key=value} command line arguments into a {@link Settings} object,
 * wires the Kafka spout and the processing bolts into a topology and then either
 * submits it to a Storm cluster (when the {@code -submit} argument is present) or runs
 * it in an in-process {@link LocalCluster} until the JVM is asked to shut down.
 */
public class Main {
    // Component identifiers used when wiring the topology.
    public static final String KAFKA_SPOUT = "kafka_spout";
    public static final String PARSE_BOLT = "parse_bolt";
    public static final String MATCH_BOLT = "match_bolt";
    public static final String SUPERVISOR_RESULT_WRITER = "supervisor_result_writer";
    public static final String ROLLUP_STATS = "rollup_stats";
    public static final String SUPERVISOR_STATS_WRITER = "supervisor_stats_writer";
    public static final String ERROR_CLASSIFIER_BOLT = "error_classifier";
    public static final String OUTLIER_DETECTION = "outlier_detection";
    public static final String OUTLIER_COLLECTOR = "outlier_collector";

    // Written by the shutdown hook thread and polled by the main thread, hence volatile.
    private static volatile boolean isRunning = true;
    private static final Logger LOG = LoggerFactory.getLogger(Main.class);

    /** Base parallelism from which worker, acker and bolt parallelism are derived. */
    public static final int GLOBAL_CONCURRENCY = 6;

    // Upper bound on how long the shutdown hook waits for the local cluster to be torn down.
    private static final long LOCAL_SHUTDOWN_TIMEOUT_MS = 30000L;

    // Placeholder written to the log instead of secret setting values.
    private static final String REDACTED = "***";

    /**
     * Builds the topology and runs it, locally or on a cluster.
     *
     * @param args {@code -key=value} settings; the bare flag {@code -submit} selects cluster submission
     * @throws IllegalArgumentException if a required setting is missing or a sink is misconfigured
     * @throws Exception if settings loading, topology submission or the local run fails
     */
    public static void main(String [] args) throws Exception
    {
        ArrayList<String> argList = new ArrayList<String>();
        for (String arg : args) {
            argList.add(arg);
        }

        // Config
        HashMap<String, String> argsMap = parseArguments(argList);

        // Default settings
        if (!argsMap.containsKey("kafka_consumer_id")) {
            argsMap.put("kafka_consumer_id", "cloudpelican_lsd_consumer");
        }

        // Settings object; a second, redacted JSON copy is built so secrets never reach the log
        Settings settings = new Settings();
        JsonObject settingsData = new JsonObject();
        JsonObject loggableData = new JsonObject();
        for (Map.Entry<String, String> kv : argsMap.entrySet()) {
            settingsData.addProperty(kv.getKey(), kv.getValue());
            loggableData.addProperty(kv.getKey(), isSecretKey(kv.getKey()) ? REDACTED : kv.getValue());
        }

        // Debug & load
        LOG.info(loggableData.toString());
        settings.load(settingsData);

        // Fail fast: without these the spout would be built from null values
        String zookeeperNodes = requireSetting(settings, "zookeeper_nodes");
        String kafkaTopic = requireSetting(settings, "kafka_topic");

        // Topology
        TopologyBuilder builder = new TopologyBuilder();

        // Time
        TimeZone.setDefault(TimeZone.getTimeZone("Etc/UTC"));

        // Read from kafka
        BrokerHosts hosts = new ZkHosts(zookeeperNodes);
        SpoutConfig spoutConfig = new SpoutConfig(hosts, kafkaTopic, "/" + kafkaTopic, settings.get("kafka_consumer_id"));
        spoutConfig.startOffsetTime = kafka.api.OffsetRequest.EarliestTime();
        spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
        KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig);
        int kafkaPartitions = Integer.parseInt(settings.getOrDefault("kafka_partitions", "3"));
        builder.setSpout(KAFKA_SPOUT, kafkaSpout, kafkaPartitions);

        // Parse bolt
        builder.setBolt(PARSE_BOLT, new ParseBolt(settings), GLOBAL_CONCURRENCY * 1).shuffleGrouping(KAFKA_SPOUT); // No local to prevent hotspots

        // Match bolt
        builder.setBolt(MATCH_BOLT, new MatchBolt(settings), GLOBAL_CONCURRENCY * 6).shuffleGrouping(PARSE_BOLT, "messages"); // No local to prevent hotspots

        // Error classifier bolt
        builder.setBolt(ERROR_CLASSIFIER_BOLT, new ErrorClassifierBolt(settings), GLOBAL_CONCURRENCY * 1).fieldsGrouping(MATCH_BOLT, new Fields("filter_id"));

        // Supervisor result writer bolt
        builder.setBolt(SUPERVISOR_RESULT_WRITER, new SupervisorResultWriterBolt(settings), GLOBAL_CONCURRENCY * 1).shuffleGrouping(MATCH_BOLT); // For efficiency fields grouping would have been better but creates hotspots

        // Supervisor stats writer bolt
        builder.setBolt(ROLLUP_STATS, new RollupStatsBolt(settings), concurrency(1, 2)).fieldsGrouping(MATCH_BOLT, "match_stats", new Fields("filter_id")).fieldsGrouping(ERROR_CLASSIFIER_BOLT, "error_stats", new Fields("filter_id"));
        builder.setBolt(SUPERVISOR_STATS_WRITER, new SupervisorStatsWriterBolt(settings), concurrency(1, 4)).fieldsGrouping(ROLLUP_STATS, "rollup_stats", new Fields("filter_id"));

        // Outlier detection bolts (sharded by filter ID)
        if (Boolean.parseBoolean(settings.getOrDefault("outlier_detection_enabled", "true"))) {
            builder.setBolt(OUTLIER_DETECTION, new OutlierDetectionBolt(settings), GLOBAL_CONCURRENCY * 2).fieldsGrouping(MATCH_BOLT, "dispatch_outlier_checks", new Fields("filter_id"));
            builder.setBolt(OUTLIER_COLLECTOR, new OutlierCollectorBolt(settings), concurrency(1, 10)).shuffleGrouping(OUTLIER_DETECTION, "outliers");
        }

        // Sink
        addSinks(builder, settings);

        // Debug on for testing
        Config conf = new Config();
        conf.setDebug(false);
        conf.setMessageTimeoutSecs(120); // Default is 30 seconds, which might be too short under peak load spikes, or when we run the outlier detection

        String topologyName = settings.getOrDefault("topology_name", "cloudpelican_stormprocessor");
        if (argList.contains("-submit")) {
            conf.setNumWorkers(GLOBAL_CONCURRENCY);
            conf.setNumAckers(GLOBAL_CONCURRENCY); // ackers = workers means every VM has an acker reducing overhead
            conf.setMaxSpoutPending(GLOBAL_CONCURRENCY * Integer.parseInt(settings.getOrDefault("topology_max_spout_multiplier", "1000")) * kafkaPartitions);
            conf.setStatsSampleRate(Double.parseDouble(settings.getOrDefault("topology_stats_sample_rate", "0.05")));
            StormSubmitter.submitTopologyWithProgressBar(topologyName, conf, builder.createTopology());
        } else {
            runLocally(topologyName, conf, builder);
        }
    }

    /**
     * Derives a parallelism hint as a fraction of {@link #GLOBAL_CONCURRENCY}.
     *
     * <p>The quotient is computed in floating point before rounding; dividing two ints
     * first would truncate and make the rounding a no-op.
     *
     * @param min lower bound of the result
     * @param part divisor applied to {@link #GLOBAL_CONCURRENCY}; must be positive
     * @return {@code GLOBAL_CONCURRENCY / part} rounded to the nearest int, but at least {@code min}
     * @throws IllegalArgumentException if {@code part} is not positive
     */
    public static int concurrency(int min, int part) {
        if (part <= 0) {
            throw new IllegalArgumentException("part must be positive, got " + part);
        }
        return Math.max(Math.round((float) GLOBAL_CONCURRENCY / part), min);
    }

    /** Turns {@code -key=value} arguments into a settings map; arguments without a non-blank key and value are ignored. */
    private static HashMap<String, String> parseArguments(ArrayList<String> argList) {
        HashMap<String, String> argsMap = new HashMap<String, String>();
        for (String arg : argList) {
            String[] split = arg.split("=", 2);
            if (split.length != 2 || split[0].trim().length() == 0 || split[1].trim().length() == 0) {
                continue;
            }
            String key = settingKeyFor(split[0]);
            if (key != null) {
                argsMap.put(key, split[1]);
            }
        }
        return argsMap;
    }

    /** Maps a command line flag to its settings key, or returns null when the flag does not start with a dash. */
    private static String settingKeyFor(String flag) {
        if (flag.equals("-zookeeper")) {
            return "zookeeper_nodes";
        }
        if (flag.equals("-grep")) {
            return "match_regex";
        }
        if (flag.equals("-topic")) {
            return "kafka_topic";
        }
        if (flag.equals("-supervisor-host")) {
            return "supervisor_host";
        }
        if (flag.equals("-supervisor-username")) {
            return "supervisor_username";
        }
        if (flag.equals("-supervisor-password")) {
            return "supervisor_password";
        }
        if (flag.equals("-conf")) {
            return "conf_path";
        }
        if (flag.startsWith("-")) {
            // Default: the flag without its leading dash is the settings key
            return flag.substring(1);
        }
        return null;
    }

    /** Tells whether the value stored under the given settings key must be kept out of the log. */
    private static boolean isSecretKey(String key) {
        return key.toLowerCase(java.util.Locale.ROOT).contains("password");
    }

    /** Returns the value of a mandatory setting, rejecting a missing or blank value with a clear message. */
    private static String requireSetting(Settings settings, String key) throws Exception {
        String value = settings.get(key);
        if (value == null || value.trim().length() == 0) {
            throw new IllegalArgumentException("Required setting '" + key + "' is missing");
        }
        return value;
    }

    /** Adds one sink bolt per id listed in the comma separated "sinks" setting, each fed by the match bolt. */
    private static void addSinks(TopologyBuilder builder, Settings settings) throws Exception {
        String sinks = settings.get("sinks");
        if (sinks == null) {
            return;
        }
        for (String sinkId : sinks.split(",")) {
            // Type
            String sinkType = settings.get("sinks." + sinkId + ".type");
            AbstractSinkBolt sinkBolt;

            // @todo Sink factory if we have multiple types
            if (sinkType == null) {
                throw new IllegalArgumentException("Sink '" + sinkId + "' invalid");
            } else if (sinkType.equalsIgnoreCase("bigquery")) {
                // Google BigQuery sink
                sinkBolt = new BigQuerySinkBolt(sinkId, settings);
            } else {
                throw new IllegalArgumentException("Sink type '" + sinkType + "' not supported");
            }

            // Add to topology
            String sinkName = "sink_" + sinkType + "_" + sinkId;
            LOG.info("Setting up sink '{}'", sinkName);
            if (!sinkBolt.isValid()) {
                // NOTE(review): an invalid sink is only reported and still added below - confirm this is intended
                LOG.error("Sink '{}' not valid", sinkName);
            }
            builder.setBolt(sinkName, sinkBolt, GLOBAL_CONCURRENCY * 2).shuffleGrouping(MATCH_BOLT); // For efficiency fields grouping would have been better but creates hotspots
        }
    }

    /** Runs the topology in an in-process cluster until JVM shutdown is requested, then tears it down. */
    private static void runLocally(final String topologyName, Config conf, TopologyBuilder builder) throws Exception {
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology(topologyName, conf, builder.createTopology());

        // Keep running until interrupt
        final Thread mainThread = Thread.currentThread();
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                LOG.info("Shutting down");
                isRunning = false;
                // The JVM halts once all hooks have returned, so wait (bounded) for the
                // main thread to finish killing the topology and stopping the cluster.
                try {
                    mainThread.join(LOCAL_SHUTDOWN_TIMEOUT_MS);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        });
        while (isRunning) {
            Thread.sleep(100);
        }
        cluster.killTopology(topologyName);
        cluster.shutdown();
    }
}